Setup

Load relevant packages

This should be done whenever you start a new r session. For this script we add several more functions that are useful for data processing, quality control, and figure plotting.

library(minfi)
library(MASS)
library(abind)
library(sva)
library(Hmisc)
library(ggplot2)

Setting file paths for data, output, scripts

# Setting file paths for data, output, scripts

# The rest of this script assumes that your data are in a folder called "project" on the Cloud.
# It also assumes that your ouptut will be stored in a subfolder called "output" on the Cloud.
# As you work on your own computer, you will need to specify the folder locations.

# Folder location of the data files
data_dir <- "/cloud/project/Data"
data_dir
## [1] "/cloud/project/Data"
# Folder location to put the output files
output_dir <- "/cloud/project/output/"
output_dir
## [1] "/cloud/project/output/"

Initial pre-processing quality control steps

There are various quality control steps that need to be executed before we can say that the data is clean

load(file.path(data_dir, "RGset.rda"))
pd <- pData(RGset)

Extract methylated and unmethylated signals

Here we extract the signal intensity from each sample. Low signal intensity is sign of a poor sample. Additionally, we can stratify signal intensity by variables such as batch to get an idea about the technical noise in our sample.

# MethylSet (Mset) contains metylated and unmethylated signals made using preprocessRaw()
rawMSet <- preprocessRaw(RGset)
## Loading required package: IlluminaHumanMethylation450kmanifest
rawMSet
## class: MethylSet 
## dim: 485512 17 
## metadata(0):
## assays(2): Meth Unmeth
## rownames(485512): cg00050873 cg00212031 ... ch.22.47579720R
##   ch.22.48274842R
## rowData names(0):
## colnames(17): GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
##   ... GSM1052032_5730053011_R06C01 GSM1052037_5730053011_R06C02
## colData names(11): GEOID celltype ... Batch filenames
## Annotation
##   array: IlluminaHumanMethylation450k
##   annotation: ilmn12.hg19
## Preprocessing
##   Method: Raw (no normalization or bg correction)
##   minfi version: 1.36.0
##   Manifest version: 0.4.0
# save(rawMSet, file = "rawMSet.rda")

# M signal per probe, per sample
Meth <- getMeth(rawMSet)
Meth[1:5, 1:5]
##            GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## cg00050873                          514                          270
## cg00212031                          170                          400
## cg00213748                          312                          490
## cg00214611                          181                          349
## cg00455876                          295                          497
##            GSM1052035_5730053011_R04C02 GSM1051874_7766130166_R02C01
## cg00050873                        26002                         1073
## cg00212031                          342                          242
## cg00213748                         1997                          237
## cg00214611                          312                          349
## cg00455876                         5458                          681
##            GSM1051871_7766130158_R05C02
## cg00050873                          396
## cg00212031                          420
## cg00213748                          286
## cg00214611                          262
## cg00455876                          470
# U signal per probe, per sample
Unmeth <- getUnmeth(rawMSet)
Unmeth[1:5, 1:5]
##            GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## cg00050873                          432                          348
## cg00212031                          494                          463
## cg00213748                          299                          476
## cg00214611                          362                          459
## cg00455876                         1183                          922
##            GSM1052035_5730053011_R04C02 GSM1051874_7766130166_R02C01
## cg00050873                         5595                          571
## cg00212031                         8375                          272
## cg00213748                          688                          358
## cg00214611                         5644                          269
## cg00455876                         3139                         1248
##            GSM1051871_7766130158_R05C02
## cg00050873                          355
## cg00212031                          478
## cg00213748                          301
## cg00214611                          576
## cg00455876                         1312

Visualize raw intensities

# Set up color palettes for plotting
myColors <- c("dodgerblue", "firebrick1", "seagreen3")
graphColors <- c(
  "#023FA5", "#7D87B9", "#BEC1D4", "#D6BCC0", "#BB7784", "#D33F6A", "#11C638", "#8DD593", "#C6DEC7",
  "#EAD3C6", "#F0B98D", "#EF9708", "#0FCFC0", "#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4",
  "#4A6FE3", "#8595E1", "#B5BBE3", "#E6AFB9", "#E07B91"
)
# Overall intensity: M vs. U
pd$MQC <- log2(colMedians(Meth))
pd$UQC <- log2(colMedians(Unmeth))

pd$Array <- factor(pd$Array)

Raw intensities

palette(graphColors)
plot(pd$UQC, pd$MQC, main = "M vs. U QC", pch = 16, xlab = "Log2 Median Unmethylated Intensity", ylab = "Log2 Median Methylated Intensity", cex.lab = 1.2, cex.main = 2)

Colored by slide

plot(pd$UQC, pd$MQC, col = as.factor(pd$Slide), main = "M vs. U QC by Slide", pch = 16, xlab = "Log2 Median Unmethylated Intensity", ylab = "Log2 Median Methylated Intensity", cex.lab = 1.2, cex.main = 2)
legend("bottomright", levels(as.factor(pd$Slide)), fill = graphColors)

Colored by position

plot(pd$UQC, pd$MQC, col = pd$Array, main = "M vs. U QC by Position", pch = 16, xlab = "Log2 Median Unmethylated Intensity", ylab = "Log2 Median Methylated Intensity", cex.lab = 1.2, cex.main = 2)
legend("bottomright", levels(pd$Array), fill = graphColors)

Colored by batch

palette(myColors)
plot(pd$UQC, pd$MQC, col = pd$Batch, main = "M vs. U QC by Batch", pch = 16, xlab = "Log2 Median Unmethylated Intensity", ylab = "Log2 Median Methylated Intensity", cex.lab = 1.2, cex.main = 2)
legend("bottomright", c("Batch 1", "Batch 2"), fill = myColors)

rm(Meth, Unmeth)

Drop or flag low intensity samples

Here we have no low intensity samples. Also note that the cutpoint of 11 for UQC and MQC is not set in stone as that value; it may depend on your data.

# Drop (or if really small sample: watch out for): Samples with UQC<11 & MQC<11
# Note the cutoff value (here, 11) would depend on your data and array (EPIC/450k)
which(pd$UQC < 11)
## integer(0)
length(which(pd$UQC < 11))
## [1] 0
which(pd$MQC < 11)
## integer(0)
length(which(pd$MQC < 11))
## [1] 0

Check sex

We can check conflicts between the reported and predicted sex of each sample. Predicted sex is derived from looking at the signal intensities of the sex chromosomes. This can help us to identify contaminated sample and poorly assayed samples.

GmRawSet <- mapToGenome(rawMSet)
## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19
sex <- getSex(GmRawSet)
colData(GmRawSet) <- sex
# save(sex, file=file.path(data_dir, "Estimate-Sex.rda"))
pd <- merge(pd, as.matrix(sex), by = "row.names", sort = FALSE)
rownames(pd) <- pd$Basename
pd <- pd[, -1]
table(pd$predictedSex, pd$gender)
##    
##      F  M
##   F 10  0
##   M  0  7
plotSex(GmRawSet)

rm(GmRawSet, sex)

Raw density plot

# Raw density plot
beta.raw <- getBeta(rawMSet)
mvalue.raw <- getM(rawMSet)
type <- getProbeType(rawMSet)
probe.type <- data.frame(Name = rownames(beta.raw), Type = type)

Beta density by case status

densityPlot(beta.raw, sampGroups = pd$casestatus, main = "Raw Beta by Case Status")

Beta density by batch

densityPlot(beta.raw, sampGroups = pd$Batch, main = "Raw Beta by Batch")

Beta density by probe type

plotBetasByType(beta.raw[, 1], probeTypes = probe.type, main = "Raw Beta by Probe Type, Sample 1")

Mvalue density

densityPlot(mvalue.raw, sampGroups = pd$Batch, main = "Raw M-value by Batch", xlab = "M-value")

Calculate principal components on raw data

Principal components show us main driving variables behind variation in the DNA methylation data

beta.raw2 <- beta.raw
beta.raw2[is.na(beta.raw2)] <- 0

rm(rawMSet, beta.raw, mvalue.raw, type, probe.type)

# makes the principal component object
prin <- prcomp(t(beta.raw2), center = T, scale. = F)
# save(prin, file=file.path(data_dir, "princomp.rda"))

# pulls out proportion of variance explained by each PC
out.var <- prin$sdev^2 / sum(prin$sdev^2) # the percent variance at each PC
out.var[1:10]
##  [1] 0.32829751 0.14178446 0.09810244 0.06415463 0.04391130 0.03810846
##  [7] 0.03432127 0.03244897 0.03101745 0.03058116
sum(out.var)
## [1] 1

Raw principal component plots

# Raw PCA plots
screeplot(prin, col = "dodgerblue", xlab = "Principal Components of Raw Beta Values", main = "", cex.lab = 1.3)

Components by sex

plot.new()
palette(myColors)
legend("bottom", legend = levels(factor(pd$gender)), fill = myColors, title = "Principal components by sex")

pairs(prin$x[, 1:6], col = factor(pd$gender), labels = paste0("PC", 1:6), pch = 1, cex = 0.5)

Components by batch

plot.new()
palette(myColors)
legend("bottom", legend = levels(factor(pd$Batch)), fill = myColors, title = "Principal components by batch")

pairs(prin$x[, 1:6], col = factor(pd$Batch), labels = paste0("PC", 1:6), pch = 1, cex = 0.5)

Components by array position

plot.new()
palette(myColors)
legend("bottom", legend = levels(factor(pd$Array)), fill = graphColors, title = "Principal components by array")

pairs(prin$x[, 1:6], col = factor(pd$Array), labels = paste0("PC", 1:6), pch = 1, cex = 0.5)

Components by slide

plot.new()
palette(myColors)
legend("bottom", legend = levels(factor(pd$Slide)), fill = myColors, title = "Principal components by slide")

pairs(prin$x[, 1:6], col = factor(pd$Slide), labels = paste0("PC", 1:6), pch = 1, cex = 0.5)

rm(phenos, prin, beta.raw2, out.var)

Detection P

DetectionP gives us a metric for assessing signal/noise ratios. Both samples and probes with higher detection p levels (indicative of unacceptable levels of noise) are cut or flagged.

detP <- detectionP(RGset)
dim(detP)
## [1] 485512     17
detP[1:5, 1:5]
##            GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## cg00050873                 1.028592e-02                  0.960841448
## cg00212031                 2.324817e-01                  0.640617366
## cg00213748                 3.325807e-01                  0.409574003
## cg00214611                 4.798274e-01                  0.750000321
## cg00455876                 5.603033e-08                  0.002416194
##            GSM1052035_5730053011_R04C02 GSM1051874_7766130166_R02C01
## cg00050873                 0.000000e+00                 2.829189e-14
## cg00212031                 0.000000e+00                 6.031609e-01
## cg00213748                 7.477842e-18                 3.836341e-01
## cg00214611                9.323309e-150                 3.248249e-01
## cg00455876                 0.000000e+00                 1.304945e-21
##            GSM1051871_7766130158_R05C02
## cg00050873                 2.420704e-01
## cg00212031                 7.499246e-02
## cg00213748                 5.500699e-01
## cg00214611                 1.276492e-01
## cg00455876                 1.940226e-09
# save(detP,file=file.path(data_dir, "detection-P.rda"))
failed <- detP > 0.01
per.samp <- colMeans(failed) # Fraction of failed positions per sample (length: 42)
summary(per.samp)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0002245 0.0006900 0.0009124 0.0028833 0.0026343 0.0183827
per.probe <- rowMeans(failed) # Fraction of failed samples per position (length: 485512)
summary(per.probe)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.002883 0.000000 0.882353
sum(per.samp > 0.01) # How many samples had more than 1% of sites fail?
## [1] 2
sum(per.probe > 0.1) # How many positions failed in >10% of samples?
## [1] 5049
# Identify samples that are failed
probe.fail <- failed[per.probe > 0.1, ]
length(probe.fail)
## [1] 85833
sample.fail <- per.samp[per.samp > 0.01]
sample.fail
## GSM1052032_5730053011_R06C01 GSM1052037_5730053011_R06C02 
##                   0.01151980                   0.01838266

Plot detection P

hist(per.samp, breaks = 40, col = "dodgerblue", cex.lab = 1.3, xlab = "Fraction of failed positions per sample")
abline(v = 0.01, col = "red")

hist(per.probe, breaks = 40, col = "dodgerblue", cex.lab = 1.3, xlab = "Fraction of failed samples per probe", ylim = c(0, 500))
abline(v = 0.1, col = "red", lwd = 2)

Drop problem samples

RGset.drop <- RGset[, !colnames(RGset) %in% names(sample.fail)] # Drop samples that failed by detection p.
dim(RGset.drop)
## [1] 622399     15
rm(RGset, detP, failed, per.samp, per.probe)

Cross-reactive probes

Cross-reactive probes are shown to ‘co-hybridize’ onto sex chromosomes and may show spurious sex differences in methylation that are merely due to technical artifacts

load(file.path(data_dir, "cross.probes.info.rda"))
dim(cross.probes.info)
## [1] 29233     5
head(cross.probes.info)
##     TargetID 47 48 49 50
## 1 cg00001510  2  0  1  0
## 2 cg00003969  0  0  2  0
## 3 cg00004121  0  4  1  0
## 4 cg00004192  0  1  0  0
## 5 cg00004209  0  2  1  0
## 6 cg00004771  3  0  0  0
# rec is to use the 47 cross probe cutoff

Noob preprocessing

Noob uses control probes to correct for background flouresence

# Noob
noob <- preprocessNoob(RGset.drop, offset = 15, dyeCorr = TRUE, verbose = TRUE)
## [dyeCorrection] Applying R/G ratio flip to fix dye bias
noob.dropP <- noob[!rownames(noob) %in% rownames(probe.fail), ]
noob.dropCross <- noob.dropP[!rownames(noob.dropP) %in% cross.probes.info$TargetID, ]
noob <- noob.dropCross

save(noob, file = file.path(data_dir, "noob.rda"))

rm(noob.dropP, noob.dropCross, cross.probes.info)

beta.n <- getBeta(noob)
pd <- pData(noob)

rm(noob)

Gap probes

Gap probes are probes for which the methylation beta clusters into discrete groups– these typically have their methylation driven by underlying SNPs

gaps <- gaphunter(beta.n)
## [gaphunter] Using 451,445 probes and 15 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found 80,470 gap signals.
## [gaphunter] Filtering out gap signals driven by outliers.
## [gaphunter] Removed 0 gap signals driven by outliers from results.
str(gaps)
## List of 3
##  $ proberesults :'data.frame':   80470 obs. of  10 variables:
##   ..$ Groups: num [1:80470] 2 2 2 4 8 3 2 2 2 2 ...
##   ..$ Group1: num [1:80470] 6 6 7 1 4 1 5 6 6 6 ...
##   ..$ Group2: num [1:80470] 9 9 8 1 2 12 10 9 9 9 ...
##   ..$ Group3: num [1:80470] 0 0 0 7 3 2 0 0 0 0 ...
##   ..$ Group4: num [1:80470] 0 0 0 6 1 0 0 0 0 0 ...
##   ..$ Group5: num [1:80470] 0 0 0 0 1 0 0 0 0 0 ...
##   ..$ Group6: num [1:80470] 0 0 0 0 2 0 0 0 0 0 ...
##   ..$ Group7: num [1:80470] 0 0 0 0 1 0 0 0 0 0 ...
##   ..$ Group8: num [1:80470] 0 0 0 0 1 0 0 0 0 0 ...
##   ..$ Group9: num [1:80470] 0 0 0 0 0 0 0 0 0 0 ...
##  $ sampleresults: num [1:80470, 1:15] 2 2 2 3 1 2 2 2 2 2 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:80470] "cg01707559" "cg03244189" "cg04689676" "cg04964672" ...
##   .. ..$ : NULL
##  $ algorithm    :List of 3
##   ..$ threshold   : num 0.05
##   ..$ outCutoff   : num 0.01
##   ..$ keepOutliers: logi FALSE
#Get a probe in the list as an example and plot the beta distribution.

gap1 <- beta.n[rownames(gaps$sampleresults)[42],]
gap1 <- data.frame(gap1)

ggplot(gap1, aes(x = rownames(gaps$sampleresults)[42], y = gap1)) +
  geom_jitter(width = 0.01) +
  labs(title = sprintf("Methylation Beta Values for probe %s", rownames(gaps$sampleresults)[42]), x = " ", y = "beta") +
  coord_cartesian(ylim = c(0, 1)) +
  theme_bw()

Cell type proportions

Cell type proportions are a strong confounder of DNA methylation. Different cell types have varying levels of methylation in different probes so we can use these probes to disentangle cell types proportions from unrelated DNA methylation differences.

# Cell type

# library(FlowSorted.Blood.450k)
# cell <- estimateCellCounts(RGset.drop)
# save(cell, file=file.path(data_dir,"cell-type-estimates.rda"))

# cell estimation may take more memory than available in RStudio Cloud, if an issue load premade cell type estimates
load(file.path(data_dir, "Premade_Intermediate_Files/cell-type-estimates.rda"))

dim(cell)
## [1] 15  6
head(cell)
##                                    CD8T        CD4T         NK      Bcell
## GSM1051870_7766130158_R03C02 0.04188973 0.084594790 0.05023918 0.04773192
## GSM1052024_5730053010_R01C02 0.11315767 0.125408728 0.03213701 0.05810510
## GSM1052035_5730053011_R04C02 0.17453771 0.175923277 0.03048209 0.10661716
## GSM1051874_7766130166_R02C01 0.08986933 0.165085488 0.02269505 0.04247854
## GSM1051871_7766130158_R05C02 0.04901293 0.070227977 0.04387917 0.03731421
## GSM1051872_7766130158_R06C02 0.02868823 0.006123224 0.01649349 0.02748160
##                                    Mono      Gran
## GSM1051870_7766130158_R03C02 0.07119778 0.7225386
## GSM1052024_5730053010_R01C02 0.09360675 0.6262584
## GSM1052035_5730053011_R04C02 0.06070866 0.5079184
## GSM1051874_7766130166_R02C01 0.09390137 0.5949493
## GSM1051871_7766130158_R05C02 0.07663316 0.7413125
## GSM1051872_7766130158_R06C02 0.03591401 0.9019293
identical(rownames(cell), colnames(beta.n)) # A quick sanity check to make sure our samples are in the right order.
## [1] TRUE
pd.n <- data.frame(pd, cell)

rm(RGset.drop)

Principal components on cell proportions

prin.cell <- prcomp(t(cell), center = T, scale. = F)
out.var <- prin.cell$sdev^2 / sum(prin.cell$sdev^2)
out.var
## [1] 9.834555e-01 1.149609e-02 3.112862e-03 1.457190e-03 4.783495e-04
## [6] 3.776973e-33
screeplot(prin.cell, col = "dodgerblue", xlab = "Principal Components of Estimated Cell Type", main = "", cex.lab = 1.3)

myColors <- c("seagreen3", "dodgerblue", "darkorchid", "firebrick1", "darkorange", "khaki1")
palette(myColors)

par(mar = c(0, 0, 0, 0))
plot.new()
legend("bottom", c("Bcell", "CD4T", "CD8T", "Gran", "Mono", "NK"), fill = myColors, title = "Principal Components by Estimated Cell Type")

pairs(prin.cell$x[, 1:6], col = as.factor(rownames(prin.cell$x)), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 15)

summary(pd.n$Gran)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5079  0.6106  0.7225  0.6965  0.7389  0.9019
hist(pd.n$Gran, breaks = 20, col = "dodgerblue", xlab = "Estimated percent granulocytes", cex.lab = 1.3)

hist(pd.n$Mono, breaks = 20, col = "dodgerblue", xlab = "Estimated percent monocytes", cex.lab = 1.3)

hist(pd.n$NK, breaks = 20, col = "dodgerblue", xlab = "Estimated percent NK cells", cex.lab = 1.3)

hist(pd.n$CD8T, breaks = 20, col = "dodgerblue", xlab = "Estimated percent CD8T", cex.lab = 1.3)

hist(pd.n$CD4T, breaks = 20, col = "dodgerblue", xlab = "Estimated percent CD4T", cex.lab = 1.3)

Drop samples with problematic cell proportions

# Pick cutoffs for biological ranges for cell type estimates.
pd.n.drop <- pd.n[pd.n$Mono < 0.2, ]
beta.n <- beta.n[, colnames(beta.n) %in% rownames(pd.n.drop)]
dim(beta.n)
## [1] 451445     15

Principal components on noob preprocessed data

# Noob PCA plots
pd.n.drop$gran.quart <- cut2(pd.n.drop$Gran, g = 4)

prin <- prcomp(t(beta.n), center = T, scale. = F)
out.var <- prin$sdev^2 / sum(prin$sdev^2)
out.var[1:10]
##  [1] 0.27005828 0.18834184 0.09414191 0.06102141 0.04786377 0.04435116
##  [7] 0.04268365 0.04147590 0.03885570 0.03711819

Plot principal components on noob data

screeplot(prin, col = "dodgerblue", xlab = "Principal Components of Noob Beta Values", main = "", cex.lab = 1.3)

By sex

plot.new()
palette(myColors)
par(mar = c(0, 0, 0, 0))
legend("bottom", levels(as.factor(pd.n.drop$gender)), fill = myColors, title = "Principal Components by Sex")

pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$gender), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By batch

plot.new()
legend("bottom", levels(as.factor(pd.n.drop$Batch)), fill = myColors, title = "Principal Components by Batch")

pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$Batch), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By case status

plot.new()
legend("bottom", levels(as.factor(pd.n.drop$casestatus)), fill = myColors, title = "Principal Components by Case Status")

pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$casestatus), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By array position

plot.new()
palette(graphColors)
legend("bottom", levels(as.factor(pd.n.drop$Array)), fill = graphColors, title = "Principal Components by Array Position")

pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$Array), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By slide

plot.new()
legend("bottom", levels(as.factor(pd.n.drop$Slide)), fill = graphColors, title = "Principal Components by Slide")

pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$Slide), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By granulocyte proportion quartile

plot.new()
legend("bottom", levels(as.factor(pd.n.drop$gran.quart)), fill = graphColors, title = "Principal Components by Quartile of Gran")

pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$gran.quart), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

Drop samples with missing data at key covariates

# Drop samples with missing data at key covariates
table(pd.n.drop$gender, useNA = "always")
## 
##    F    M <NA> 
##    9    6    0
table(pd.n.drop$casestatus, useNA = "always")
## 
## Control      RA    <NA> 
##       5      10       0
table(pd.n.drop$smoking, useNA = "always")
## 
##    current         ex      never occasional       <NA> 
##          4          5          4          2          0
table(pd.n.drop$Batch, useNA = "always")
## 
##    1    2 <NA> 
##    5   10    0
# No samples missing data, we're okay. But run this line anyway.
pd.complete <- pd.n.drop[!(is.na(pd.n.drop$smoking)), ] # Example of how to remove.
save(pd.complete, file = file.path(data_dir, "pd-complete.rda"))

Use Combat to adjust for batch effects

beta.noob.complete <- beta.n[, colnames(beta.n) %in% rownames(pd.complete)]
rm(beta.n)
mod <- model.matrix(~ pd.complete$gender + pd.complete$casestatus + pd.complete$smoking)

# combat.beta <- ComBat(dat = beta.noob.complete, batch = pd.complete$Batch, mod = mod)

# if ComBat requires more memory than available, load premade results:
load(file.path(data_dir, "Premade_Intermediate_Files/combat-beta.rda"))
combat.beta[1:5, 1:5]
##            GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## cg01707559                    0.1597848                    0.1462628
## cg03244189                    0.1943061                    0.1996871
## cg04689676                    0.1552875                    0.1546703
## cg04964672                    0.6965052                    0.6376027
## cg13851368                    0.2522998                    0.2511689
##            GSM1052035_5730053011_R04C02 GSM1051874_7766130166_R02C01
## cg01707559                   0.05731516                    0.2056190
## cg03244189                   0.08506135                    0.2456049
## cg04689676                   0.03855749                    0.1420821
## cg04964672                   0.88477145                    0.5154691
## cg13851368                   0.80565715                    0.3034519
##            GSM1051871_7766130158_R05C02
## cg01707559                    0.2304606
## cg03244189                    0.2356330
## cg04689676                    0.1885954
## cg04964672                    0.6739231
## cg13851368                    0.2614161
# save the cleaned matrix.
save(combat.beta, file = file.path(data_dir, "combat-beta.rda"))

prin <- prcomp(t(combat.beta), center = T, scale. = F)
out.var <- prin$sdev^2 / sum(prin$sdev^2)
out.var[1:10]
##  [1] 0.24201673 0.18800841 0.09044395 0.07922183 0.06261984 0.05290151
##  [7] 0.04745881 0.04666915 0.04532922 0.04157885

Plot principal components of combat data

screeplot(prin, col = "dodgerblue", xlab = "Principal Components of Noob Beta Values", main = "", cex.lab = 1.3)

By sex

plot.new()
palette(myColors)
par(mar = c(0, 0, 0, 0))
legend("bottom", levels(as.factor(pd.complete$gender)), fill = myColors, title = "Principal Components by Sex")

pairs(prin$x[, 1:6], col = as.factor(pd.complete$gender), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By batch

plot.new()
legend("bottom", levels(as.factor(pd.complete$Batch)), fill = myColors, title = "Principal Components by Batch")

pairs(prin$x[, 1:6], col = as.factor(pd.complete$Batch), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By case status

plot.new()
legend("bottom", levels(as.factor(pd.complete$casestatus)), fill = myColors, title = "Principal Components by Case Status")

pairs(prin$x[, 1:6], col = as.factor(pd.complete$casestatus), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By array position

plot.new()
palette(graphColors)
legend("bottom", levels(as.factor(pd.complete$Array)), fill = graphColors, title = "Principal Components by Array Position")

pairs(prin$x[, 1:6], col = as.factor(pd.complete$Array), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By slide

plot.new()
legend("bottom", levels(as.factor(pd.complete$Slide)), fill = graphColors, title = "Principal Components by Slide")

pairs(prin$x[, 1:6], col = as.factor(pd.complete$Slide), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)

By quartile of granulocyte proportion

plot.new()
legend("bottom", levels(as.factor(pd.complete$gran.quart)), fill = graphColors, title = "Principal Components by Quartile of Gran")

pairs(prin$x[, 1:6], col = as.factor(pd.complete$gran.quart), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)